# 1、创建环境
from pyspark.context import SparkContext

# Local-mode SparkContext shared by every join example below.
sc = SparkContext(appName='join', master='local')

# (id, name) pairs; id '005' has no matching entry in ages_rdd.
names_rdd = sc.parallelize([
    ('002', '李四'),
    ('003', '王五'),
    ('004', '赵六'),
    ('005', '豆包'),
])

# (id, age) pairs; id '001' has no matching entry in names_rdd.
ages_rdd = sc.parallelize([
    ('001', 23),
    ('002', 24),
    ('003', 25),
    ('004', 26),
])

# 1. Inner join on the id key, then print every joined record.
inner_join_rdd = names_rdd.join(ages_rdd)
inner_join_rdd.foreach(print)

# 2. Left outer join with names_rdd as the left side; the result is
#    formatted and printed further down, after map_fun is defined.
left_join_rdd = names_rdd.leftOuterJoin(ages_rdd)


def map_fun(line):
    """Flatten one joined record into a ``(key, name, age)`` tuple.

    ``line`` is read as ``(key, (name, age))``. When ``age`` is ``None``
    it is replaced with ``0``; any other value (including ``0``) is kept.
    """
    record_key = line[0]
    joined = line[1]
    name = joined[0]
    age = joined[1]
    if age is None:
        # Substitute a default so the output tuple never carries None.
        age = 0
    return record_key, name, age

# Tidy the left-join records into flat tuples via map_fun and print them.
formatted_left_rdd = left_join_rdd.map(map_fun)
formatted_left_rdd.foreach(print)


# 3. Full outer join of names and ages; print the raw joined records.
full_join_rdd = names_rdd.fullOuterJoin(ages_rdd)
full_join_rdd.foreach(print)
